#include "opencl_source_map.hpp" 
namespace MNN { 
#ifndef MNN_OPENCL_BUFFER_CLOSED
const char* pooling_buf = 
"#ifdef MNN_SUPPORT_FP16\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
"#define GLOBAL_SIZE_3_DIMS "" __private const int global_size_dim0,__private const int global_size_dim1,__private const int global_size_dim2,\n"
"#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { "" return; "" }\n"
"__kernel void pooling(GLOBAL_SIZE_3_DIMS __global const FLOAT *input,\n"
" __private const int2 input_shape,\n"
" __private const int2 output_shape,\n"
" __private const int2 pad_shape,\n"
" __private const int2 stride_shape,\n"
" __private const int2 kernel_shape,\n"
" __global FLOAT *output,\n"
" __global FLOAT *rediceOutput,\n"
" __private const int batch) {\n"
" \n"
" const int ow_idx=get_global_id(0);\n"
" const int b_oh_idx=get_global_id(1);\n"
" const int c_idx=get_global_id(2);\n"
" DEAL_NON_UNIFORM_DIM3(ow_idx,b_oh_idx,c_idx);\n"
" \n"
" const int b_idx=b_oh_idx/output_shape.x;\n"
" const int oh_idx=b_oh_idx % output_shape.x;\n"
" const int iw_start=mad24(ow_idx,stride_shape.y,-pad_shape.y);\n"
" const int ih_start=mad24(oh_idx,stride_shape.x,-pad_shape.x);\n"
" \n"
" #ifdef RETURN_REDICE\n"
" int4 redice=(int4)0;\n"
" #endif\n"
" #ifdef POOL_AVG\n"
" COMPUTE_FLOAT4 result=(COMPUTE_FLOAT4)(0);\n"
" const int inp_offset=(((b_idx+c_idx*batch)*input_shape.x+ih_start)*input_shape.y+iw_start)*4;\n"
" #ifdef COUNT_INCLUDE_PADDING\n"
" int total_count=(min(ih_start+kernel_shape.x,input_shape.x+pad_shape.x)-ih_start)*(min(iw_start+kernel_shape.y,input_shape.y+pad_shape.y)-iw_start);\n"
" #else\n"
" int total_count=0;\n"
" #endif\n"
" for(int kh=0; kh<kernel_shape.x; kh++) {\n"
" int ih_cur=ih_start+kh;\n"
" if(ih_cur<0 || ih_cur >= input_shape.x) {\n"
" continue;\n"
" }\n"
" for(int kw=0; kw<kernel_shape.y; kw++) {\n"
" int iw_cur=iw_start+kw;\n"
" if(iw_cur<0 || iw_cur >= input_shape.y) {\n"
" continue;\n"
" }\n"
" COMPUTE_FLOAT4 inp_data=CONVERT_COMPUTE_FLOAT4(vload4(0,input+inp_offset+(kh*input_shape.y+kw)*4));\n"
" result += inp_data;\n"
" #ifndef COUNT_INCLUDE_PADDING\n"
" total_count++;\n"
" #endif\n"
" }\n"
" }\n"
" result=result/(COMPUTE_FLOAT4)(1.0*total_count);\n"
" #else\n"
" COMPUTE_FLOAT4 result=(COMPUTE_FLOAT4)(-FLT_MAX);\n"
" const int inp_offset=(((b_idx+c_idx*batch)*input_shape.x+ih_start)*input_shape.y+iw_start)*4;\n"
" for(int kh=0; kh<kernel_shape.x; kh++) {\n"
" int ih_cur=ih_start+kh;\n"
" if(ih_cur<0 || ih_cur >= input_shape.x) {\n"
" continue;\n"
" }\n"
" for(int kw=0; kw<kernel_shape.y; kw++) {\n"
" int iw_cur=iw_start+kw;\n"
" if(iw_cur<0 || iw_cur >= input_shape.y) {\n"
" continue;\n"
" }\n"
" COMPUTE_FLOAT4 inp_data=CONVERT_COMPUTE_FLOAT4(vload4(0,input+inp_offset+(kh*input_shape.y+kw)*4));\n"
" #ifdef RETURN_REDICE\n"
" redice=inp_data>result ? (int4)((ih_start+kh)*input_shape.y+iw_start+kw) : redice;\n"
" #endif\n"
" result=fmax(result,inp_data);\n"
" }\n"
" }\n"
" #endif\n"
" \n"
" const int out_offset=(((b_idx+c_idx*batch)*output_shape.x+oh_idx)* output_shape.y+ow_idx)*4;\n"
" vstore4(CONVERT_FLOAT4(result),0,output+out_offset);\n"
" #ifdef RETURN_REDICE\n"
" vstore4(CONVERT_FLOAT4(redice),0,rediceOutput+out_offset);\n"
" #endif\n"
"}\n"
"#ifdef LOCAL_SIZE\n"
"__kernel void global_pooling_buf(GLOBAL_SIZE_3_DIMS __global const FLOAT *input,\n"
" __private const int2 input_shape,\n"
" __private const int2 output_shape,\n"
" __private const int2 pad_shape,\n"
" __private const int2 stride_shape,\n"
" __private const int2 kernel_shape,\n"
" __global FLOAT *output,\n"
" __global FLOAT *rediceOutput,\n"
" __private const int batch) {\n"
" const int local_id=get_local_id(0);\n"
" const int output_channel_idx=get_global_id(1);\n"
" const int output_batch_idx=get_global_id(2);\n"
"#ifdef POOL_AVG\n"
" COMPUTE_FLOAT4 output_result=0;\n"
"#else\n"
" COMPUTE_FLOAT4 output_result=(COMPUTE_FLOAT4)(-FLT_MAX);\n"
"#endif\n"
"#ifdef RETURN_REDICE\n"
" int4 redice=(int4)0;\n"
" int4 local rediceId[LOCAL_SIZE];\n"
"#endif\n"
" COMPUTE_FLOAT4 local sum_mnn[LOCAL_SIZE];\n"
" const int inp_offset=((output_batch_idx+output_channel_idx*batch)*input_shape.x)*input_shape.y*4;\n"
" const int size=input_shape.x*input_shape.y;\n"
" for(int i=local_id; i<size; i+=LOCAL_SIZE){\n"
" int w=i % input_shape.y;;\n"
" int h=i/input_shape.y;\n"
" COMPUTE_FLOAT4 in=CONVERT_COMPUTE_FLOAT4(vload4(0,input+inp_offset+(h*input_shape.y+w)*4));\n"
"#ifdef POOL_AVG\n"
" output_result += in;\n"
"#else\n"
" output_result=fmax(output_result,in);\n"
"#ifdef RETURN_REDICE\n"
" redice=in>output_result ? (int4)(i) : redice;\n"
"#endif\n"
"#endif\n"
" }\n"
" \n"
" sum_mnn[local_id]=output_result;\n"
"#ifdef RETURN_REDICE\n"
" rediceId[local_id]=redice;\n"
"#endif\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" for(int i=LOCAL_SIZE/2; i>0; i /= 2){\n"
" if (local_id<i)\n"
"#ifdef POOL_AVG\n"
" sum_mnn[local_id]=sum_mnn[local_id]+sum_mnn[local_id+i];\n"
"#else\n"
" {\n"
"#ifdef RETURN_REDICE\n"
" rediceId[local_id]=sum_mnn[local_id]>sum_mnn[local_id+i] ? rediceId[local_id] : rediceId[local_id+i];\n"
"#endif\n"
" sum_mnn[local_id]=fmax(sum_mnn[local_id],sum_mnn[local_id+i]);\n"
" }\n"
"#endif\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" }\n"
" output_result=sum_mnn[0];\n"
"#ifdef POOL_AVG\n"
" output_result /= (input_shape.x*input_shape.y);\n"
"#endif\n"
" const int out_offset=(output_batch_idx+output_channel_idx*batch)*4;\n"
" vstore4(CONVERT_FLOAT4(output_result),0,output+out_offset);\n"
"#ifdef RETURN_REDICE\n"
" redice=rediceId[0];\n"
" vstore4(CONVERT_FLOAT4(redice),0,rediceOutput+out_offset);\n"
"#endif\n"
"}\n"
"#endif\n"
;
#endif
}
